import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
path =r"Z:\Taiwan-Customer defaults csv.csv"
print(path)
data = pd.read_csv(path)
print(data)
Z:\Taiwan-Customer defaults csv.csv
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 \
0 1 20000 2 2 1 24 2 2 -1
1 2 120000 2 2 2 26 -1 2 0
2 3 90000 2 2 2 34 0 0 0
3 4 50000 2 2 1 37 0 0 0
4 5 50000 1 2 1 57 -1 0 -1
... ... ... ... ... ... ... ... ... ...
29995 29996 220000 1 3 1 39 0 0 0
29996 29997 150000 1 3 2 43 -1 -1 -1
29997 29998 30000 1 2 2 37 4 3 2
29998 29999 80000 1 3 1 41 1 -1 0
29999 30000 50000 1 2 1 46 0 0 0
PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 \
0 -1 ... 0 0 0 0 689
1 0 ... 3272 3455 3261 0 1000
2 0 ... 14331 14948 15549 1518 1500
3 0 ... 28314 28959 29547 2000 2019
4 0 ... 20940 19146 19131 2000 36681
... ... ... ... ... ... ... ...
29995 0 ... 88004 31237 15980 8500 20000
29996 -1 ... 8979 5190 0 1837 3526
29997 -1 ... 20878 20582 19357 0 0
29998 0 ... 52774 11855 48944 85900 3409
29999 0 ... 36535 32428 15313 2078 1800
PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
0 0 0 0 0 1
1 1000 1000 0 2000 1
2 1000 1000 1000 5000 0
3 1200 1100 1069 1000 0
4 10000 9000 689 679 0
... ... ... ... ... ...
29995 5003 3047 5000 1000 0
29996 8998 129 0 0 0
29997 22000 4200 2000 3100 1
29998 1178 1926 52964 1804 1
29999 1430 1000 1000 1000 1
[30000 rows x 25 columns]
data.head()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 2 | 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | ... | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 3 | 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | ... | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 4 | 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | ... | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
5 rows × 25 columns
data.tail()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29995 | 29996 | 220000 | 1 | 3 | 1 | 39 | 0 | 0 | 0 | 0 | ... | 88004 | 31237 | 15980 | 8500 | 20000 | 5003 | 3047 | 5000 | 1000 | 0 |
| 29996 | 29997 | 150000 | 1 | 3 | 2 | 43 | -1 | -1 | -1 | -1 | ... | 8979 | 5190 | 0 | 1837 | 3526 | 8998 | 129 | 0 | 0 | 0 |
| 29997 | 29998 | 30000 | 1 | 2 | 2 | 37 | 4 | 3 | 2 | -1 | ... | 20878 | 20582 | 19357 | 0 | 0 | 22000 | 4200 | 2000 | 3100 | 1 |
| 29998 | 29999 | 80000 | 1 | 3 | 1 | 41 | 1 | -1 | 0 | 0 | ... | 52774 | 11855 | 48944 | 85900 | 3409 | 1178 | 1926 | 52964 | 1804 | 1 |
| 29999 | 30000 | 50000 | 1 | 2 | 1 | 46 | 0 | 0 | 0 | 0 | ... | 36535 | 32428 | 15313 | 2078 | 1800 | 1430 | 1000 | 1000 | 1000 | 1 |
5 rows × 25 columns
print(data.shape)
(30000, 25)
print("Features of the dataset:")
data.columns
Features of the dataset:
Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
'default payment next month'],
dtype='object')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30000 entries, 0 to 29999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 30000 non-null int64 1 LIMIT_BAL 30000 non-null int64 2 SEX 30000 non-null int64 3 EDUCATION 30000 non-null int64 4 MARRIAGE 30000 non-null int64 5 AGE 30000 non-null int64 6 PAY_0 30000 non-null int64 7 PAY_2 30000 non-null int64 8 PAY_3 30000 non-null int64 9 PAY_4 30000 non-null int64 10 PAY_5 30000 non-null int64 11 PAY_6 30000 non-null int64 12 BILL_AMT1 30000 non-null int64 13 BILL_AMT2 30000 non-null int64 14 BILL_AMT3 30000 non-null int64 15 BILL_AMT4 30000 non-null int64 16 BILL_AMT5 30000 non-null int64 17 BILL_AMT6 30000 non-null int64 18 PAY_AMT1 30000 non-null int64 19 PAY_AMT2 30000 non-null int64 20 PAY_AMT3 30000 non-null int64 21 PAY_AMT4 30000 non-null int64 22 PAY_AMT5 30000 non-null int64 23 PAY_AMT6 30000 non-null int64 24 default payment next month 30000 non-null int64 dtypes: int64(25) memory usage: 5.7 MB
data.nunique()
ID 30000 LIMIT_BAL 81 SEX 2 EDUCATION 7 MARRIAGE 4 AGE 56 PAY_0 11 PAY_2 11 PAY_3 11 PAY_4 11 PAY_5 10 PAY_6 10 BILL_AMT1 22723 BILL_AMT2 22346 BILL_AMT3 22026 BILL_AMT4 21548 BILL_AMT5 21010 BILL_AMT6 20604 PAY_AMT1 7943 PAY_AMT2 7899 PAY_AMT3 7518 PAY_AMT4 6937 PAY_AMT5 6897 PAY_AMT6 6939 default payment next month 2 dtype: int64
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 30000.0 | 15000.500000 | 8660.398374 | 1.0 | 7500.75 | 15000.5 | 22500.25 | 30000.0 |
| LIMIT_BAL | 30000.0 | 167484.322667 | 129747.661567 | 10000.0 | 50000.00 | 140000.0 | 240000.00 | 1000000.0 |
| SEX | 30000.0 | 1.603733 | 0.489129 | 1.0 | 1.00 | 2.0 | 2.00 | 2.0 |
| EDUCATION | 30000.0 | 1.853133 | 0.790349 | 0.0 | 1.00 | 2.0 | 2.00 | 6.0 |
| MARRIAGE | 30000.0 | 1.551867 | 0.521970 | 0.0 | 1.00 | 2.0 | 2.00 | 3.0 |
| AGE | 30000.0 | 35.485500 | 9.217904 | 21.0 | 28.00 | 34.0 | 41.00 | 79.0 |
| PAY_0 | 30000.0 | -0.016700 | 1.123802 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| PAY_2 | 30000.0 | -0.133767 | 1.197186 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| PAY_3 | 30000.0 | -0.166200 | 1.196868 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| PAY_4 | 30000.0 | -0.220667 | 1.169139 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| PAY_5 | 30000.0 | -0.266200 | 1.133187 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| PAY_6 | 30000.0 | -0.291100 | 1.149988 | -2.0 | -1.00 | 0.0 | 0.00 | 8.0 |
| BILL_AMT1 | 30000.0 | 51223.330900 | 73635.860576 | -165580.0 | 3558.75 | 22381.5 | 67091.00 | 964511.0 |
| BILL_AMT2 | 30000.0 | 49179.075167 | 71173.768783 | -69777.0 | 2984.75 | 21200.0 | 64006.25 | 983931.0 |
| BILL_AMT3 | 30000.0 | 47013.154800 | 69349.387427 | -157264.0 | 2666.25 | 20088.5 | 60164.75 | 1664089.0 |
| BILL_AMT4 | 30000.0 | 43262.948967 | 64332.856134 | -170000.0 | 2326.75 | 19052.0 | 54506.00 | 891586.0 |
| BILL_AMT5 | 30000.0 | 40311.400967 | 60797.155770 | -81334.0 | 1763.00 | 18104.5 | 50190.50 | 927171.0 |
| BILL_AMT6 | 30000.0 | 38871.760400 | 59554.107537 | -339603.0 | 1256.00 | 17071.0 | 49198.25 | 961664.0 |
| PAY_AMT1 | 30000.0 | 5663.580500 | 16563.280354 | 0.0 | 1000.00 | 2100.0 | 5006.00 | 873552.0 |
| PAY_AMT2 | 30000.0 | 5921.163500 | 23040.870402 | 0.0 | 833.00 | 2009.0 | 5000.00 | 1684259.0 |
| PAY_AMT3 | 30000.0 | 5225.681500 | 17606.961470 | 0.0 | 390.00 | 1800.0 | 4505.00 | 896040.0 |
| PAY_AMT4 | 30000.0 | 4826.076867 | 15666.159744 | 0.0 | 296.00 | 1500.0 | 4013.25 | 621000.0 |
| PAY_AMT5 | 30000.0 | 4799.387633 | 15278.305679 | 0.0 | 252.50 | 1500.0 | 4031.50 | 426529.0 |
| PAY_AMT6 | 30000.0 | 5215.502567 | 17777.465775 | 0.0 | 117.75 | 1500.0 | 4000.00 | 528666.0 |
| default payment next month | 30000.0 | 0.221200 | 0.415062 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
data.isna().sum()
data.isnull().sum()
ID 0 LIMIT_BAL 0 SEX 0 EDUCATION 0 MARRIAGE 0 AGE 0 PAY_0 0 PAY_2 0 PAY_3 0 PAY_4 0 PAY_5 0 PAY_6 0 BILL_AMT1 0 BILL_AMT2 0 BILL_AMT3 0 BILL_AMT4 0 BILL_AMT5 0 BILL_AMT6 0 PAY_AMT1 0 PAY_AMT2 0 PAY_AMT3 0 PAY_AMT4 0 PAY_AMT5 0 PAY_AMT6 0 default payment next month 0 dtype: int64
missing = pd.DataFrame((data.isnull().sum())*100/data.shape[0]).reset_index()
plt.figure(figsize=(16,5))
ax = sns.pointplot('index',0,data=missing)
plt.xticks(rotation =90,fontsize =7)
plt.title("Percentage of Missing values")
plt.ylabel("PERCENTAGE")
plt.show()
value=len(data[data.duplicated()])
print("The number of duplicate values in the data set is = ",value)
The number of duplicate values in the data set is = 0
data.rename(columns={'default payment next month' : 'default_payment_next_month'}, inplace=True)
data['default_payment_next_month'].value_counts()
0 23364 1 6636 Name: default_payment_next_month, dtype: int64
data['default_payment_next_month'].value_counts(normalize=True)
0 0.7788 1 0.2212 Name: default_payment_next_month, dtype: float64
#plotting the count plot to vizualize the data distribution
#plot the count plot to check the data distribution
plt.figure(figsize=(10,5))
sns.countplot(x = 'default_payment_next_month', data = data)
<AxesSubplot:xlabel='default_payment_next_month', ylabel='count'>
data['SEX'].value_counts()
2 18112 1 11888 Name: SEX, dtype: int64
plt.figure(figsize=(10,5))
sns.countplot(x = 'SEX', data = data)
<AxesSubplot:xlabel='SEX', ylabel='count'>
data['EDUCATION'].value_counts()
2 14030 1 10585 3 4917 5 280 4 123 6 51 0 14 Name: EDUCATION, dtype: int64
data["EDUCATION"] = data["EDUCATION"].replace({4:0,5:0,6:0})
data["EDUCATION"].value_counts()
2 14030 1 10585 3 4917 0 468 Name: EDUCATION, dtype: int64
plt.figure(figsize=(10,5))
sns.countplot(x = 'EDUCATION', data = data)
<AxesSubplot:xlabel='EDUCATION', ylabel='count'>
data["MARRIAGE"].unique()
array([1, 2, 3, 0], dtype=int64)
data['MARRIAGE'].value_counts()
2 15964 1 13659 3 323 0 54 Name: MARRIAGE, dtype: int64
data["MARRIAGE"].value_counts(normalize=True)
2 0.532133 1 0.455300 3 0.010767 0 0.001800 Name: MARRIAGE, dtype: float64
data["MARRIAGE"] = data["MARRIAGE"].replace({0:3})
data["MARRIAGE"].value_counts(normalize=True)
2 0.532133 1 0.455300 3 0.012567 Name: MARRIAGE, dtype: float64
plt.figure(figsize=(10,5))
sns.countplot(x = 'MARRIAGE', data=data)
<AxesSubplot:xlabel='MARRIAGE', ylabel='count'>
data['AGE'].value_counts()
29 1605 27 1477 28 1409 30 1395 26 1256 31 1217 25 1186 34 1162 32 1158 33 1146 24 1127 35 1113 36 1108 37 1041 39 954 38 944 23 931 40 870 41 824 42 794 44 700 43 670 45 617 46 570 22 560 47 501 48 466 49 452 50 411 51 340 53 325 52 304 54 247 55 209 56 178 58 122 57 122 59 83 60 67 21 67 61 56 62 44 63 31 64 31 66 25 65 24 67 16 69 15 70 10 68 5 73 4 72 3 75 3 71 3 79 1 74 1 Name: AGE, dtype: int64
data.groupby('default_payment_next_month')['AGE'].mean()
default_payment_next_month 0 35.417266 1 35.725738 Name: AGE, dtype: float64
data= data.astype('int')
plt.figure(figsize=(15,7))
sns.countplot(x = 'AGE', data = data)
plt.show()
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="default_payment_next_month", y="AGE", data=data)
data['LIMIT_BAL'].describe()
count 30000.000000 mean 167484.322667 std 129747.661567 min 10000.000000 25% 50000.000000 50% 140000.000000 75% 240000.000000 max 1000000.000000 Name: LIMIT_BAL, dtype: float64
plt.figure(figsize=(10,5))
sns.distplot(data['LIMIT_BAL'], kde=True)
plt.show()
sns.barplot(x='default_payment_next_month', y='LIMIT_BAL', data=data)
<AxesSubplot:xlabel='default_payment_next_month', ylabel='LIMIT_BAL'>
plt.figure(figsize=(10,10))
ax = sns.boxplot(x="default_payment_next_month", y="LIMIT_BAL", data=data)
data.rename(columns={'PAY_0':'PAY_SEPT','PAY_2':'PAY_AUG','PAY_3':'PAY_JUL','PAY_4':'PAY_JUN','PAY_5':'PAY_MAY','PAY_6':'PAY_APR'},inplace=True)
data.rename(columns={'BILL_AMT1':'BILL_AMT_SEPT','BILL_AMT2':'BILL_AMT_AUG','BILL_AMT3':'BILL_AMT_JUL','BILL_AMT4':'BILL_AMT_JUN','BILL_AMT5':'BILL_AMT_MAY','BILL_AMT6':'BILL_AMT_APR'}, inplace = True)
data.rename(columns={'PAY_AMT1':'PAY_AMT_SEPT','PAY_AMT2':'PAY_AMT_AUG','PAY_AMT3':'PAY_AMT_JUL','PAY_AMT4':'PAY_AMT_JUN','PAY_AMT5':'PAY_AMT_MAY','PAY_AMT6':'PAY_AMT_APR'},inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30000 entries, 0 to 29999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 30000 non-null int32 1 LIMIT_BAL 30000 non-null int32 2 SEX 30000 non-null int32 3 EDUCATION 30000 non-null int32 4 MARRIAGE 30000 non-null int32 5 AGE 30000 non-null int32 6 PAY_SEPT 30000 non-null int32 7 PAY_AUG 30000 non-null int32 8 PAY_JUL 30000 non-null int32 9 PAY_JUN 30000 non-null int32 10 PAY_MAY 30000 non-null int32 11 PAY_APR 30000 non-null int32 12 BILL_AMT_SEPT 30000 non-null int32 13 BILL_AMT_AUG 30000 non-null int32 14 BILL_AMT_JUL 30000 non-null int32 15 BILL_AMT_JUN 30000 non-null int32 16 BILL_AMT_MAY 30000 non-null int32 17 BILL_AMT_APR 30000 non-null int32 18 PAY_AMT_SEPT 30000 non-null int32 19 PAY_AMT_AUG 30000 non-null int32 20 PAY_AMT_JUL 30000 non-null int32 21 PAY_AMT_JUN 30000 non-null int32 22 PAY_AMT_MAY 30000 non-null int32 23 PAY_AMT_APR 30000 non-null int32 24 default_payment_next_month 30000 non-null int32 dtypes: int32(25) memory usage: 2.9 MB
data.head()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_SEPT | PAY_AUG | PAY_JUL | PAY_JUN | ... | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | PAY_AMT_AUG | PAY_AMT_JUL | PAY_AMT_JUN | PAY_AMT_MAY | PAY_AMT_APR | default_payment_next_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 2 | 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | ... | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
| 3 | 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | ... | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
| 4 | 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | ... | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
5 rows × 25 columns
total_bill_amnt_df = data[['BILL_AMT_SEPT', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_APR']]
sns.pairplot(data = total_bill_amnt_df)
<seaborn.axisgrid.PairGrid at 0x27311f253a0>
#plotting the count plot for Previous payment status
pre_payment = ['PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR']
for col in pre_payment:
plt.figure(figsize=(10,5))
sns.countplot(x = col, hue = 'default_payment_next_month', data = data)
pay_amnt_df = data[['PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_APR', 'default_payment_next_month']]
sns.pairplot(data = pay_amnt_df, hue='default_payment_next_month')
<seaborn.axisgrid.PairGrid at 0x2731645ef70>
data.shape
(30000, 25)
x,y = 'SEX', 'default_payment_next_month'
(data
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
<seaborn.axisgrid.FacetGrid at 0x27316535d00>
x,y = 'EDUCATION', 'default_payment_next_month'
(data
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
<seaborn.axisgrid.FacetGrid at 0x27317255070>
x,y = 'MARRIAGE', 'default_payment_next_month'
(data
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
<seaborn.axisgrid.FacetGrid at 0x2731733bfd0>
plt.figure(figsize=(19,7))
sns.barplot(x = 'AGE', y = 'default_payment_next_month', data = data)
plt.show()
plt.figure(figsize=(20,15))
sns.heatmap(data.corr(),annot=True,cmap="coolwarm")
<AxesSubplot:>
from imblearn.over_sampling import SMOTE
smote = SMOTE()
# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(data.iloc[:,0:-1], data['default_payment_next_month'])
print('Original dataset shape', len(data))
print('Resampled dataset shape', len(y_smote))
Original dataset shape 30000 Resampled dataset shape 46728
x_smote
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_SEPT | PAY_AUG | PAY_JUL | PAY_JUN | ... | BILL_AMT_JUL | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | PAY_AMT_AUG | PAY_AMT_JUL | PAY_AMT_JUN | PAY_AMT_MAY | PAY_AMT_APR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 689 | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 |
| 1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 2682 | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 |
| 2 | 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | ... | 13559 | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 |
| 3 | 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | ... | 49291 | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 |
| 4 | 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | ... | 35835 | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46723 | 13623 | 396156 | 2 | 1 | 1 | 28 | -2 | -1 | -1 | -1 | ... | 5711 | 15896 | 61653 | 19694 | 7542 | 5740 | 15969 | 61823 | 13134 | 9864 |
| 46724 | 20354 | 50000 | 1 | 1 | 2 | 23 | 0 | 0 | 0 | 0 | ... | 3585 | 3846 | 1316 | 1284 | 1073 | 1242 | 3812 | 1272 | 9 | 9 |
| 46725 | 4218 | 10000 | 1 | 1 | 1 | 31 | 1 | 0 | 0 | 0 | ... | 9343 | 9136 | 9930 | 9368 | 1840 | 1373 | 580 | 1300 | 0 | 844 |
| 46726 | 19207 | 70000 | 1 | 1 | 2 | 32 | 2 | 0 | 0 | 0 | ... | 37192 | 38272 | 38916 | 39837 | 2095 | 2000 | 1721 | 1300 | 1559 | 1514 |
| 46727 | 26403 | 40000 | 1 | 1 | 2 | 26 | 1 | 0 | 0 | 2 | ... | 40559 | 40981 | 39173 | 38831 | 2000 | 3905 | 1422 | 51 | 1517 | 1517 |
46728 rows × 24 columns
columns = list(data.columns)
columns.pop()
'default_payment_next_month'
balance_df = pd.DataFrame(x_smote, columns=columns)
balance_df['default_payment_next_month'] = y_smote
sns.countplot('default_payment_next_month', data = balance_df)
<AxesSubplot:xlabel='default_payment_next_month', ylabel='count'>
balance_df[balance_df['default_payment_next_month']==1]
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_SEPT | PAY_AUG | PAY_JUL | PAY_JUN | ... | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | PAY_AMT_AUG | PAY_AMT_JUL | PAY_AMT_JUN | PAY_AMT_MAY | PAY_AMT_APR | default_payment_next_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
| 13 | 14 | 70000 | 1 | 2 | 2 | 30 | 1 | 2 | 2 | 0 | ... | 66782 | 36137 | 36894 | 3200 | 0 | 3000 | 3000 | 1500 | 0 | 1 |
| 16 | 17 | 20000 | 1 | 1 | 2 | 24 | 0 | 0 | 2 | 2 | ... | 18338 | 17905 | 19104 | 3200 | 0 | 1500 | 0 | 1650 | 0 | 1 |
| 21 | 22 | 120000 | 2 | 2 | 1 | 39 | -1 | -1 | -1 | -1 | ... | 0 | 632 | 316 | 316 | 316 | 0 | 632 | 316 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46723 | 13623 | 396156 | 2 | 1 | 1 | 28 | -2 | -1 | -1 | -1 | ... | 15896 | 61653 | 19694 | 7542 | 5740 | 15969 | 61823 | 13134 | 9864 | 1 |
| 46724 | 20354 | 50000 | 1 | 1 | 2 | 23 | 0 | 0 | 0 | 0 | ... | 3846 | 1316 | 1284 | 1073 | 1242 | 3812 | 1272 | 9 | 9 | 1 |
| 46725 | 4218 | 10000 | 1 | 1 | 1 | 31 | 1 | 0 | 0 | 0 | ... | 9136 | 9930 | 9368 | 1840 | 1373 | 580 | 1300 | 0 | 844 | 1 |
| 46726 | 19207 | 70000 | 1 | 1 | 2 | 32 | 2 | 0 | 0 | 0 | ... | 38272 | 38916 | 39837 | 2095 | 2000 | 1721 | 1300 | 1559 | 1514 | 1 |
| 46727 | 26403 | 40000 | 1 | 1 | 2 | 26 | 1 | 0 | 0 | 2 | ... | 40981 | 39173 | 38831 | 2000 | 3905 | 1422 | 51 | 1517 | 1517 | 1 |
23364 rows × 25 columns
credit_df_copy = balance_df.copy()
credit_df_copy['total_Payement_Value'] = credit_df_copy['PAY_SEPT'] + credit_df_copy['PAY_AUG'] + credit_df_copy['PAY_JUL'] + credit_df_copy['PAY_JUN'] + credit_df_copy['PAY_MAY'] + credit_df_copy['PAY_APR']
credit_df_copy.groupby('default_payment_next_month')['total_Payement_Value'].mean()
default_payment_next_month 0 -1.980140 1 1.682332 Name: total_Payement_Value, dtype: float64
plt.figure(figsize=(10,10))
sns.boxplot(data = credit_df_copy, x = 'default_payment_next_month', y = 'total_Payement_Value' )
<AxesSubplot:xlabel='default_payment_next_month', ylabel='total_Payement_Value'>
credit_df_copy['Dues'] = (credit_df_copy['BILL_AMT_APR']+credit_df_copy['BILL_AMT_MAY']+credit_df_copy['BILL_AMT_JUN']+credit_df_copy['BILL_AMT_JUL']+credit_df_copy['BILL_AMT_SEPT'])-(credit_df_copy['PAY_AMT_APR']+credit_df_copy['PAY_AMT_MAY']+credit_df_copy['PAY_AMT_JUN']+credit_df_copy['PAY_AMT_JUL']+credit_df_copy['PAY_AMT_AUG']+credit_df_copy['PAY_AMT_SEPT'])
credit_df_copy.groupby('default_payment_next_month')['Dues'].mean()
default_payment_next_month 0 187742.051532 1 192413.576956 Name: Dues, dtype: float64
credit_df_copy.replace({'SEX': {1 : 'MALE', 2 : 'FEMALE'},
'EDUCATION' : {1 : 'graduate school', 2 : 'university', 3 : 'high school', 0 : 'others'},
'MARRIAGE' : {1 : 'married', 2 : 'single', 3 : 'others'}}, inplace = True)
credit_df_copy.head()
| ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_SEPT | PAY_AUG | PAY_JUL | PAY_JUN | ... | BILL_AMT_APR | PAY_AMT_SEPT | PAY_AMT_AUG | PAY_AMT_JUL | PAY_AMT_JUN | PAY_AMT_MAY | PAY_AMT_APR | default_payment_next_month | total_Payement_Value | Dues | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 20000 | FEMALE | university | married | 24 | 2 | 2 | -1 | -1 | ... | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 | -2 | 3913 |
| 1 | 2 | 120000 | FEMALE | university | single | 26 | -1 | 2 | 0 | 0 | ... | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 | 3 | 10352 |
| 2 | 3 | 90000 | FEMALE | university | single | 34 | 0 | 0 | 0 | 0 | ... | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 | 0 | 76608 |
| 3 | 4 | 50000 | FEMALE | university | married | 37 | 0 | 0 | 0 | 0 | ... | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 | 0 | 174713 |
| 4 | 5 | 50000 | MALE | university | married | 57 | -1 | 0 | -1 | 0 | ... | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 | -2 | 44620 |
5 rows × 27 columns
credit_df_copy = pd.get_dummies(credit_df_copy,columns=['EDUCATION','MARRIAGE'])
credit_df_copy = pd.get_dummies(credit_df_copy, columns = ['PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR'], drop_first = True )
encoders_nums = {
"SEX":{"FEMALE": 0, "MALE": 1}
}
credit_df_copy = credit_df_copy.replace(encoders_nums)
credit_df_copy.drop('ID',axis = 1, inplace = True)
credit_df_copy.columns
Index(['LIMIT_BAL', 'SEX', 'AGE', 'BILL_AMT_SEPT', 'BILL_AMT_AUG',
'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_APR',
'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_JUN',
'PAY_AMT_MAY', 'PAY_AMT_APR', 'default_payment_next_month',
'total_Payement_Value', 'Dues', 'EDUCATION_graduate school',
'EDUCATION_high school', 'EDUCATION_others', 'EDUCATION_university',
'MARRIAGE_married', 'MARRIAGE_others', 'MARRIAGE_single', 'PAY_SEPT_-1',
'PAY_SEPT_0', 'PAY_SEPT_1', 'PAY_SEPT_2', 'PAY_SEPT_3', 'PAY_SEPT_4',
'PAY_SEPT_5', 'PAY_SEPT_6', 'PAY_SEPT_7', 'PAY_SEPT_8', 'PAY_AUG_-1',
'PAY_AUG_0', 'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4',
'PAY_AUG_5', 'PAY_AUG_6', 'PAY_AUG_7', 'PAY_AUG_8', 'PAY_JUL_-1',
'PAY_JUL_0', 'PAY_JUL_1', 'PAY_JUL_2', 'PAY_JUL_3', 'PAY_JUL_4',
'PAY_JUL_5', 'PAY_JUL_6', 'PAY_JUL_7', 'PAY_JUL_8', 'PAY_JUN_-1',
'PAY_JUN_0', 'PAY_JUN_1', 'PAY_JUN_2', 'PAY_JUN_3', 'PAY_JUN_4',
'PAY_JUN_5', 'PAY_JUN_6', 'PAY_JUN_7', 'PAY_JUN_8', 'PAY_MAY_-1',
'PAY_MAY_0', 'PAY_MAY_1', 'PAY_MAY_2', 'PAY_MAY_3', 'PAY_MAY_4',
'PAY_MAY_5', 'PAY_MAY_6', 'PAY_MAY_7', 'PAY_MAY_8', 'PAY_APR_-1',
'PAY_APR_0', 'PAY_APR_1', 'PAY_APR_2', 'PAY_APR_3', 'PAY_APR_4',
'PAY_APR_5', 'PAY_APR_6', 'PAY_APR_7', 'PAY_APR_8'],
dtype='object')
credit_df_copy.shape
(46728, 85)
credit_df_copy.head()
| LIMIT_BAL | SEX | AGE | BILL_AMT_SEPT | BILL_AMT_AUG | BILL_AMT_JUL | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | ... | PAY_APR_-1 | PAY_APR_0 | PAY_APR_1 | PAY_APR_2 | PAY_APR_3 | PAY_APR_4 | PAY_APR_5 | PAY_APR_6 | PAY_APR_7 | PAY_APR_8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20000 | 0 | 24 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 120000 | 0 | 26 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 90000 | 0 | 34 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 50000 | 0 | 37 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 50000 | 1 | 57 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 85 columns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
#logisticregresion
credit_df_logistic = credit_df_copy.copy()
credit_df_logistic.head()
| LIMIT_BAL | SEX | AGE | BILL_AMT_SEPT | BILL_AMT_AUG | BILL_AMT_JUL | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | ... | PAY_APR_-1 | PAY_APR_0 | PAY_APR_1 | PAY_APR_2 | PAY_APR_3 | PAY_APR_4 | PAY_APR_5 | PAY_APR_6 | PAY_APR_7 | PAY_APR_8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20000 | 0 | 24 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 120000 | 0 | 26 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 90000 | 0 | 34 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 50000 | 0 | 37 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 50000 | 1 | 57 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 85 columns
X = credit_df_logistic.drop(['default_payment_next_month','total_Payement_Value','Dues'],axis=1)
y = credit_df_logistic['default_payment_next_month']
columns = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
param_grid = {'penalty':['l1','l2'], 'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
grid_lr_clf = GridSearchCV(LogisticRegression(), param_grid, scoring = 'accuracy', n_jobs = -1, verbose = 3, cv = 3)
grid_lr_clf.fit(X_train, y_train)
Fitting 3 folds for each of 14 candidates, totalling 42 fits
GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'penalty': ['l1', 'l2']},
scoring='accuracy', verbose=3)
optimized_clf = grid_lr_clf.best_estimator_
grid_lr_clf.best_params_
{'C': 10, 'penalty': 'l2'}
grid_lr_clf.best_score_
0.7527391545575073
train_preds = optimized_clf.predict_proba(X_train)[:,1]
test_preds = optimized_clf.predict_proba(X_test)[:,1]
train_class_preds = optimized_clf.predict(X_train)
test_class_preds = optimized_clf.predict(X_test)
# Get the accuracy scores
train_accuracy_lr = accuracy_score(train_class_preds,y_train)
test_accuracy_lr = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_lr)
print("The accuracy on test data is ", test_accuracy_lr)
The accuracy on train data is 0.7538250231577602 The accuracy on test data is 0.753323390182219
#print the accuracy,precission,recall,f1,roc_score
test_accuracy_lr = accuracy_score(test_class_preds,y_test)
test_precision_score_lr = precision_score(test_class_preds,y_test)
test_recall_score_lr = recall_score(test_class_preds,y_test)
test_f1_score_lr = f1_score(test_class_preds,y_test)
test_roc_score_lr = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_lr)
print("The precision on test data is ", test_precision_score_lr)
print("The recall on test data is ", test_recall_score_lr)
print("The f1 on test data is ", test_f1_score_lr)
print("The roc_score on test data is ", test_roc_score_lr)
The accuracy on test data is 0.753323390182219 The precision on test data is 0.6878080415045396 The recall on test data is 0.7914925373134328 The f1 on test data is 0.7360166551006245 The roc_score on test data is 0.7577460393252177
cm_lr = confusion_matrix(y_train, train_class_preds)
print(cm_lr)
[[12821 2832] [ 4875 10779]]
labels = ['Not Defaulter', 'Defaulter']
ax= plt.subplot()
sns.heatmap(cm_lr, annot=True, ax = ax) #annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
[Text(0, 0.5, 'Not Defaulter'), Text(0, 1.5, 'Defaulter')]
feature_importance = pd.DataFrame({'Features':columns, 'Importance':np.abs(optimized_clf.coef_).ravel() })
feature_importance = feature_importance.sort_values(by = 'Importance', ascending=False)[:10]
plt.bar(height=feature_importance['Importance'], x= feature_importance['Features'])
plt.xticks(rotation=80)
plt.title("Feature importances via coefficients")
plt.show()
y_preds_proba_lr = optimized_clf.predict_proba(X_test)[::,1]
y_pred_proba = y_preds_proba_lr
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
def G(v):
bins = np.linspace(0., 100., 11)
total = float(np.sum(v))
yvals = []
for b in bins:
bin_vals = v[v <= np.percentile(v, b)]
bin_fraction = (np.sum(bin_vals) / total) * 100.0
yvals.append(bin_fraction)
# perfect equality area
pe_area = np.trapz(bins, x=bins)
# lorenz area
lorenz_area = np.trapz(yvals, x=bins)
gini_val = (pe_area - lorenz_area) / float(pe_area)
return bins, yvals, gini_val
bins, result, gini_val = G(y_preds_proba_lr)
plt.figure()
plt.subplot(2, 1, 1)
plt.plot(bins, result, label="observed")
plt.plot(bins, bins, '--', label="perfect eq.")
plt.xlabel("fraction of population")
plt.ylabel("fraction of wealth")
plt.title("GINI: %.4f" %(gini_val))
Text(0.5, 1.0, 'GINI: 0.3203')
from sklearn.ensemble import RandomForestClassifier
X = credit_df_copy.drop(['default_payment_next_month','total_Payement_Value','Dues'],axis=1)
y = credit_df_copy['default_payment_next_month']
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train,y_train)
RandomForestClassifier()
train_class_preds = rf_clf.predict(X_train)
test_class_preds = rf_clf.predict(X_test)
train_accuracy_rf = accuracy_score(train_class_preds,y_train)
test_accuracy_rf = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_rf)
print("The accuracy on test data is ", test_accuracy_rf)
The accuracy on train data is 0.999361165234612 The accuracy on test data is 0.8314635886129305
test_accuracy_rf = accuracy_score(test_class_preds,y_test)
test_precision_score_rf = precision_score(test_class_preds,y_test)
test_recall_score_rf = recall_score(test_class_preds,y_test)
test_f1_score_rf = f1_score(test_class_preds,y_test)
test_roc_score_rf = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_rf)
print("The precision on test data is ", test_precision_score_rf)
print("The recall on test data is ", test_recall_score_rf)
print("The f1 on test data is ", test_f1_score_rf)
print("The roc_score on test data is ", test_roc_score_rf)
The accuracy on test data is 0.8314635886129305 The precision on test data is 0.7989623865110247 The recall on test data is 0.8544874462477459 The f1 on test data is 0.8257926134459415 The roc_score on test data is 0.8328696364214861
param_grid = {'n_estimators': [100,150,200], 'max_depth': [10,20,30]}
grid_rf_clf = GridSearchCV(RandomForestClassifier(), param_grid, scoring = 'accuracy', n_jobs = -1, verbose = 3, cv = 3)
grid_rf_clf.fit(X_train, y_train)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
param_grid={'max_depth': [10, 20, 30],
'n_estimators': [100, 150, 200]},
scoring='accuracy', verbose=3)
grid_rf_clf.best_score_
0.8233941501745736
grid_rf_clf.best_params_
{'max_depth': 30, 'n_estimators': 150}
optimal_rf_clf = grid_rf_clf.best_estimator_
train_class_preds = optimal_rf_clf.predict(X_train)
test_class_preds = optimal_rf_clf.predict(X_test)
train_accuracy_rf = accuracy_score(train_class_preds,y_train)
test_accuracy_rf = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_rf)
print("The accuracy on test data is ", test_accuracy_rf)
The accuracy on train data is 0.9984667965630689 The accuracy on test data is 0.8323065949030543
test_accuracy_rf = accuracy_score(test_class_preds,y_test)
test_precision_score_rf = precision_score(test_class_preds,y_test)
test_recall_score_rf = recall_score(test_class_preds,y_test)
test_f1_score_rf = f1_score(test_class_preds,y_test)
test_roc_score_rf = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_rf)
print("The precision on test data is ", test_precision_score_rf)
print("The recall on test data is ", test_recall_score_rf)
print("The f1 on test data is ", test_f1_score_rf)
print("The roc_score on test data is ", test_roc_score_rf)
The accuracy on test data is 0.8323065949030543 The precision on test data is 0.800129701686122 The recall on test data is 0.8551427779317993 The f1 on test data is 0.8267220584293755 The roc_score on test data is 0.8336881185869549
cm_rf = confusion_matrix(y_train, train_class_preds)
print(cm_rf)
[[15636 17] [ 31 15623]]
labels = ['Not Defaulter', 'Defaulter']
ax= plt.subplot()
sns.heatmap(cm_rf, annot=True, ax = ax) #annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
[Text(0, 0.5, 'Not Defaulter'), Text(0, 1.5, 'Defaulter')]
len(optimal_rf_clf.feature_importances_)
82
feature_importances_rf = pd.DataFrame(optimal_rf_clf.feature_importances_,
index = columns,
columns=['importance_rf']).sort_values('importance_rf',
ascending=False)[:10]
plt.subplots(figsize=(17,6))
plt.title("Feature importances")
plt.bar(feature_importances_rf.index, feature_importances_rf['importance_rf'],
color="g", align="center")
plt.xticks(feature_importances_rf.index, rotation = 85)
#plt.xlim([-1, X.shape[1]])
plt.show()
train_class_preds = optimal_rf_clf.predict(X_train)
test_class_preds = optimal_rf_clf.predict(X_test)
y_preds_proba_rf = optimal_rf_clf.predict_proba(X_test)[::,1]
import sklearn.metrics as metrics
y_pred_proba = y_preds_proba_rf
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
def G(v):
bins = np.linspace(0., 100., 11)
total = float(np.sum(v))
yvals = []
for b in bins:
bin_vals = v[v <= np.percentile(v, b)]
bin_fraction = (np.sum(bin_vals) / total) * 100.0
yvals.append(bin_fraction)
# perfect equality area
pe_area = np.trapz(bins, x=bins)
# lorenz area
lorenz_area = np.trapz(yvals, x=bins)
gini_val = (pe_area - lorenz_area) / float(pe_area)
return bins, yvals, gini_val
bins, result, gini_val = G(y_preds_proba_rf)
plt.figure()
plt.subplot(2, 1, 1)
plt.plot(bins, result, label="observed")
plt.plot(bins, bins, '--', label="perfect eq.")
plt.xlabel("fraction of population")
plt.ylabel("fraction of wealth")
plt.title("GINI: %.4f" %(gini_val))
Text(0.5, 1.0, 'GINI: 0.3368')
from sklearn.tree import DecisionTreeClassifier#cartmodel
credit_df_cart=credit_df_copy.copy()
credit_df_cart.head()
| LIMIT_BAL | SEX | AGE | BILL_AMT_SEPT | BILL_AMT_AUG | BILL_AMT_JUL | BILL_AMT_JUN | BILL_AMT_MAY | BILL_AMT_APR | PAY_AMT_SEPT | ... | PAY_APR_-1 | PAY_APR_0 | PAY_APR_1 | PAY_APR_2 | PAY_APR_3 | PAY_APR_4 | PAY_APR_5 | PAY_APR_6 | PAY_APR_7 | PAY_APR_8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20000 | 0 | 24 | 3913 | 3102 | 689 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 120000 | 0 | 26 | 2682 | 1725 | 2682 | 3272 | 3455 | 3261 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 90000 | 0 | 34 | 29239 | 14027 | 13559 | 14331 | 14948 | 15549 | 1518 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 50000 | 0 | 37 | 46990 | 48233 | 49291 | 28314 | 28959 | 29547 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 50000 | 1 | 57 | 8617 | 5670 | 35835 | 20940 | 19146 | 19131 | 2000 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 85 columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)
Confusion Matrix:
[[5598 2113]
[1871 5839]]
Classification Report:
precision recall f1-score support
0 0.75 0.73 0.74 7711
1 0.73 0.76 0.75 7710
accuracy 0.74 15421
macro avg 0.74 0.74 0.74 15421
weighted avg 0.74 0.74 0.74 15421
Accuracy: 0.7416509953958887
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
text_representation = tree.export_text(clf)
print(text_representation)
with open("decistion_tree.log", "w") as fout:
fout.write(text_representation)
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(clf,feature_names=X.columns,filled=True)
|--- feature_34 <= 0.50 | |--- feature_35 <= 0.50 | | |--- feature_19 <= 0.50 | | | |--- feature_25 <= 0.50 | | | | |--- feature_10 <= 1996.00 | | | | | |--- feature_1 <= 0.50 | | | | | | |--- feature_64 <= 0.50 | | | | | | | |--- feature_55 <= 0.50 | | | | | | | | |--- feature_44 <= 0.50 | | | | | | | | | |--- feature_9 <= 738.50 | | | | | | | | | | |--- feature_11 <= 383.50 | | | | | | | | | | | |--- truncated branch of depth 31 | | | | | | | | | | |--- feature_11 > 383.50 | | | | | | | | | | | |--- truncated branch of depth 15 | | | | | | | | | |--- feature_9 > 738.50 | | | | | | | | | | |--- feature_5 <= 26573.50 | | | | | | | | | | | |--- truncated branch of depth 22 | | | | | | | | | | |--- feature_5 > 26573.50 | | | | | | | | | | | |--- truncated branch of depth 16 | | | | | | | | |--- feature_44 > 0.50 | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_55 > 0.50 | | | | | | | | |--- feature_11 <= 750.00 | | | | | | | | | |--- feature_9 <= 3538.50 | | | | | | | | | | |--- feature_11 <= 163.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_11 > 163.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_9 > 3538.50 | | | | | | | | | | |--- feature_9 <= 6915.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_9 > 6915.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_11 > 750.00 | | | | | | | | | |--- feature_10 <= 1101.00 | | | | | | | | | | |--- feature_2 <= 22.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_2 > 22.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | |--- feature_10 > 1101.00 | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_64 > 0.50 | | | | | | | |--- class: 1 | | | | | |--- feature_1 > 0.50 | | | | | | |--- feature_44 <= 0.50 | | | | | | | |--- feature_74 <= 0.50 | | | | | | | | |--- feature_9 <= 4818.00 | | | | | | | | | |--- feature_54 <= 0.50 | | | | | | | | | | |--- feature_3 <= 16224.00 | | | | | | | | | | | |--- truncated branch of depth 39 | | | | | | | | | | |--- feature_3 > 16224.00 | | | | | | | | | | | |--- truncated branch of depth 17 | | | | | | | | | |--- feature_54 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_9 > 4818.00 | | | | | | | | | |--- feature_54 <= 0.50 | | | | | | | | | | |--- feature_13 <= 5350.00 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | | |--- feature_13 > 5350.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_54 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_74 > 0.50 | | | | | | | | |--- class: 1 | | | | | | |--- feature_44 > 0.50 | | | | | | | |--- class: 1 | | | | |--- feature_10 > 1996.00 | | | | | |--- feature_54 <= 0.50 | | | | | | |--- feature_24 <= 0.50 | | | | | | | |--- feature_74 <= 0.50 | | | | | | | | |--- feature_0 <= 149429.50 | | | | | | | | | |--- feature_0 <= 140058.00 | | | | | | | | | | |--- feature_1 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 27 | | | | | | | | | | |--- feature_1 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 24 | | | | | | | | | |--- feature_0 > 140058.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_0 > 149429.50 | | | | | | | | | |--- feature_4 <= 224882.00 | | | | | | | | | | |--- feature_3 <= 1955.00 | | | | | | | | | | | |--- truncated branch of depth 15 | | | | | | | | | | |--- feature_3 > 1955.00 | | | | | | | | | | | |--- truncated branch of depth 25 | | | | | | | | | |--- feature_4 > 224882.00 | | | | | | | | | | |--- feature_0 <= 396552.50 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | | |--- feature_0 > 396552.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | |--- feature_74 > 0.50 | | | | | | | | |--- class: 1 | | | | | | |--- feature_24 > 0.50 | | | | | | | |--- feature_33 <= 0.50 | | | | | | | | |--- feature_9 <= 1027.00 | | | | | | | | | |--- feature_65 <= 0.50 | | | | | | | | | | |--- feature_12 <= 7195.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_12 > 7195.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_65 > 0.50 | | | | | | | | | | |--- feature_7 <= 843.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_7 > 843.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_9 > 1027.00 | | | | | | | | | |--- feature_3 <= -5523.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_3 > -5523.00 | | | | | | | | | | |--- feature_8 <= -176.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_8 > -176.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | |--- feature_33 > 0.50 | | | | | | | | |--- feature_0 <= 450000.00 | | | | | | | | | |--- feature_11 <= 59101.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_11 > 59101.00 | | | | | | | | | | |--- feature_2 <= 28.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_2 > 28.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_0 > 450000.00 | | | | | | | | | |--- class: 0 | | | | | |--- feature_54 > 0.50 | | | | | | |--- class: 1 | | | |--- feature_25 > 0.50 | | | | |--- feature_42 <= 0.50 | | | | | |--- feature_2 <= 52.50 | | | | | | |--- feature_17 <= 0.50 | | | | | | | |--- feature_3 <= 1114.50 | | | | | | | | |--- feature_10 <= 8.00 | | | | | | | | | |--- feature_46 <= 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_46 > 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_10 > 8.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_3 > 1114.50 | | | | | | | | |--- feature_12 <= 59718.50 | | | | | | | | | |--- feature_47 <= 0.50 | | | | | | | | | | |--- feature_4 <= 1719.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_4 > 1719.50 | | | | | | | | | | | |--- truncated branch of depth 16 | | | | | | | | | |--- feature_47 > 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_12 > 59718.50 | | | | | | | | | |--- class: 0 | | | | | | |--- feature_17 > 0.50 | | | | | | | |--- feature_4 <= 45102.00 | | | | | | | | |--- feature_13 <= 1026.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_13 > 1026.50 | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_4 > 45102.00 | | | | | | | | |--- class: 0 | | | | | |--- feature_2 > 52.50 | | | | | | |--- feature_14 <= 318.50 | | | | | | | |--- class: 1 | | | | | | |--- feature_14 > 318.50 | | | | | | | |--- feature_13 <= 2710.00 | | | | | | | | |--- feature_2 <= 61.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_2 > 61.00 | | | | | | | | | |--- feature_7 <= 29543.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_7 > 29543.50 | | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_13 > 2710.00 | | | | | | | | |--- feature_11 <= 5663.00 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_11 > 5663.00 | | | | | | | | | |--- class: 0 | | | | |--- feature_42 > 0.50 | | | | | |--- feature_2 <= 25.50 | | | | | | |--- feature_12 <= 2314.00 | | | | | | | |--- class: 1 | | | | | | |--- feature_12 > 2314.00 | | | | | | | |--- class: 0 | | | | | |--- feature_2 > 25.50 | | | | | | |--- feature_11 <= 5881.00 | | | | | | | |--- class: 0 | | | | | | |--- feature_11 > 5881.00 | | | | | | | |--- feature_0 <= 60000.00 | | | | | | | | |--- class: 1 | | | | | | | |--- feature_0 > 60000.00 | | | | | | | | |--- class: 0 | | |--- feature_19 > 0.50 | | | |--- feature_1 <= 0.50 | | | | |--- feature_9 <= 1999.00 | | | | | |--- feature_16 <= 0.50 | | | | | | |--- feature_0 <= 49049.50 | | | | | | | |--- feature_23 <= 0.50 | | | | | | | | |--- feature_32 <= 0.50 | | | | | | | | | |--- feature_13 <= 1976.50 | | | | | | | | | | |--- feature_3 <= 31317.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_3 > 31317.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_13 > 1976.50 | | | | | | | | | | |--- feature_14 <= 2059.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_14 > 2059.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_32 > 0.50 | | | | | | | | | |--- feature_6 <= 931.50 | | | | | | | | | | |--- feature_6 <= 371.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_6 > 371.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_6 > 931.50 | | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_23 > 0.50 | | | | | | | | |--- feature_3 <= 3716.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_3 > 3716.50 | | | | | | | | | |--- feature_17 <= 0.50 | | | | | | | | | | |--- feature_4 <= 1035.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_4 > 1035.50 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | | |--- feature_17 > 0.50 | | | | | | | | | | |--- class: 0 | | | | | | |--- feature_0 > 49049.50 | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | |--- feature_0 <= 165730.00 | | | | | | | | | |--- feature_8 <= 2.50 | | | | | | | | | | |--- feature_7 <= -47.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_7 > -47.50 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | |--- feature_8 > 2.50 | | | | | | | | | | |--- feature_11 <= 978.00 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_11 > 978.00 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | |--- feature_0 > 165730.00 | | | | | | | | | |--- feature_6 <= 1424.00 | | | | | | | | | | |--- feature_3 <= 2461.50 | | | | | | | | | | | |--- truncated branch of depth 22 | | | | | | | | | | |--- feature_3 > 2461.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | |--- feature_6 > 1424.00 | | | | | | | | | | |--- feature_7 <= 171109.00 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_7 > 171109.00 | | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_15 > 0.50 | | | | | | | | |--- feature_23 <= 0.50 | | | | | | | | | |--- feature_33 <= 0.50 | | | | | | | | | | |--- feature_14 <= 1900.00 | | | | | | | | | | | |--- truncated branch of depth 24 | | | | | | | | | | |--- feature_14 > 1900.00 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | | |--- feature_33 > 0.50 | | | | | | | | | | |--- feature_3 <= 7312.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_3 > 7312.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | |--- feature_23 > 0.50 | | | | | | | | | |--- feature_3 <= 6674.50 | | | | | | | | | | |--- feature_4 <= 4994.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- feature_4 > 4994.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_3 > 6674.50 | | | | | | | | | | |--- feature_0 <= 96767.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_0 > 96767.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | |--- feature_16 > 0.50 | | | | | | |--- feature_25 <= 0.50 | | | | | | | |--- feature_11 <= 511.00 | | | | | | | | |--- feature_6 <= 2.50 | | | | | | | | | |--- feature_3 <= 1553.50 | | | | | | | | | | |--- feature_0 <= 25000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_0 > 25000.00 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | |--- feature_3 > 1553.50 | | | | | | | | | | |--- feature_4 <= 6468.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_4 > 6468.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_6 > 2.50 | | | | | | | | | |--- feature_6 <= 323.00 | | | | | | | | | | |--- feature_12 <= 235.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_12 > 235.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_6 > 323.00 | | | | | | | | | | |--- feature_0 <= 115000.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_0 > 115000.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | |--- feature_11 > 511.00 | | | | | | | | |--- feature_44 <= 0.50 | | | | | | | | | |--- feature_4 <= 69088.50 | | | | | | | | | | |--- feature_62 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_62 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | |--- feature_4 > 69088.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_44 > 0.50 | | | | | | | | | |--- class: 1 | | | | | | |--- feature_25 > 0.50 | | | | | | | |--- feature_7 <= 7221.00 | | | | | | | | |--- class: 0 | | | | | | | |--- feature_7 > 7221.00 | | | | | | | | |--- feature_2 <= 22.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_2 > 22.50 | | | | | | | | | |--- feature_10 <= 1900.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_10 > 1900.00 | | | | | | | | | | |--- feature_7 <= 27947.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_7 > 27947.00 | | | | | | | | | | | |--- class: 0 | | | | |--- feature_9 > 1999.00 | | | | | |--- feature_25 <= 0.50 | | | | | | |--- feature_44 <= 0.50 | | | | | | | |--- feature_2 <= 33.50 | | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | | |--- feature_0 <= 78752.50 | | | | | | | | | | |--- feature_4 <= 50365.50 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | | |--- feature_4 > 50365.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | |--- feature_0 > 78752.50 | | | | | | | | | | |--- feature_54 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 18 | | | | | | | | | | |--- feature_54 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_15 > 0.50 | | | | | | | | | |--- feature_0 <= 177750.50 | | | | | | | | | | |--- feature_10 <= 2998.00 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_10 > 2998.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | |--- feature_0 > 177750.50 | | | | | | | | | | |--- feature_6 <= 180991.00 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_6 > 180991.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | |--- feature_2 > 33.50 | | | | | | | | |--- feature_74 <= 0.50 | | | | | | | | | |--- feature_11 <= 4985.00 | | | | | | | | | | |--- feature_11 <= 4822.50 | | | | | | | | | | | |--- truncated branch of depth 23 | | | | | | | | | | |--- feature_11 > 4822.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_11 > 4985.00 | | | | | | | | | | |--- feature_4 <= 200917.50 | | | | | | | | | | | |--- truncated branch of depth 19 | | | | | | | | | | |--- feature_4 > 200917.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | |--- feature_74 > 0.50 | | | | | | | | | |--- class: 1 | | | | | | |--- feature_44 > 0.50 | | | | | | | |--- class: 1 | | | | | |--- feature_25 > 0.50 | | | | | | |--- feature_13 <= 15318.50 | | | | | | | |--- feature_3 <= 919.50 | | | | | | | | |--- feature_0 <= 200000.00 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_0 > 200000.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_3 > 919.50 | | | | | | | | |--- feature_9 <= 2103.50 | | | | | | | | | |--- feature_5 <= 44253.00 | | | | | | | | | | |--- feature_12 <= 2500.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_12 > 2500.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_5 > 44253.00 | | | | | | | | | | |--- feature_14 <= 1722.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_14 > 1722.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_9 > 2103.50 | | | | | | | | | |--- feature_3 <= 67640.50 | | | | | | | | | | |--- feature_13 <= 4414.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_13 > 4414.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_3 > 67640.50 | | | | | | | | | | |--- feature_16 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_16 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | |--- feature_13 > 15318.50 | | | | | | | |--- class: 0 | | | |--- feature_1 > 0.50 | | | | |--- feature_10 <= 3998.00 | | | | | |--- feature_16 <= 0.50 | | | | | | |--- feature_0 <= 199257.00 | | | | | | | |--- feature_2 <= 40.50 | | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | | |--- feature_0 <= 109709.50 | | | | | | | | | | |--- feature_5 <= 3.00 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_5 > 3.00 | | | | | | | | | | | |--- truncated branch of depth 16 | | | | | | | | | |--- feature_0 > 109709.50 | | | | | | | | | | |--- feature_11 <= 2821.50 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | | |--- feature_11 > 2821.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | |--- feature_15 > 0.50 | | | | | | | | | |--- feature_12 <= 9972.00 | | | | | | | | | | |--- feature_8 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_8 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | |--- feature_12 > 9972.00 | | | | | | | | | | |--- feature_8 <= 78497.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_8 > 78497.50 | | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_2 > 40.50 | | | | | | | | |--- feature_24 <= 0.50 | | | | | | | | | |--- feature_54 <= 0.50 | | | | | | | | | | |--- feature_2 <= 56.50 | | | | | | | | | | | |--- truncated branch of depth 17 | | | | | | | | | | |--- feature_2 > 56.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | |--- feature_54 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_24 > 0.50 | | | | | | | | | |--- feature_3 <= 1.50 | | | | | | | | | | |--- feature_2 <= 48.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | | |--- feature_2 > 48.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_3 > 1.50 | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_0 > 199257.00 | | | | | | | |--- feature_9 <= 998.50 | | | | | | | | |--- feature_23 <= 0.50 | | | | | | | | | |--- feature_13 <= 388.50 | | | | | | | | | | |--- feature_8 <= 1.00 | | | | | | | | | | | |--- truncated branch of depth 15 | | | | | | | | | | |--- feature_8 > 1.00 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | |--- feature_13 > 388.50 | | | | | | | | | | |--- feature_53 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | | |--- feature_53 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | |--- feature_23 > 0.50 | | | | | | | | | |--- feature_3 <= 11149.50 | | | | | | | | | | |--- feature_17 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- feature_17 > 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_3 > 11149.50 | | | | | | | | | | |--- feature_6 <= 37.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_6 > 37.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | |--- feature_9 > 998.50 | | | | | | | | |--- feature_64 <= 0.50 | | | | | | | | | |--- feature_3 <= 914.00 | | | | | | | | | | |--- feature_23 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_23 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_3 > 914.00 | | | | | | | | | | |--- feature_11 <= 3137.00 | | | | | | | | | | | |--- truncated branch of depth 16 | | | | | | | | | | |--- feature_11 > 3137.00 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | |--- feature_64 > 0.50 | | | | | | | | | |--- class: 1 | | | | | |--- feature_16 > 0.50 | | | | | | |--- feature_0 <= 109003.00 | | | | | | | |--- feature_2 <= 41.50 | | | | | | | | |--- feature_8 <= 50033.50 | | | | | | | | | |--- feature_4 <= 39109.50 | | | | | | | | | | |--- feature_9 <= 1211.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- feature_9 > 1211.00 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | |--- feature_4 > 39109.50 | | | | | | | | | | |--- feature_7 <= 20061.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_7 > 20061.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | |--- feature_8 > 50033.50 | | | | | | | | | |--- feature_33 <= 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_33 > 0.50 | | | | | | | | | | |--- feature_5 <= 75784.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_5 > 75784.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | |--- feature_2 > 41.50 | | | | | | | | |--- feature_64 <= 0.50 | | | | | | | | | |--- feature_9 <= 1168.00 | | | | | | | | | | |--- feature_0 <= 65000.00 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | | |--- feature_0 > 65000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_9 > 1168.00 | | | | | | | | | | |--- feature_46 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | | |--- feature_46 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_64 > 0.50 | | | | | | | | | |--- class: 1 | | | | | | |--- feature_0 > 109003.00 | | | | | | | |--- feature_25 <= 0.50 | | | | | | | | |--- feature_11 <= 567.50 | | | | | | | | | |--- feature_6 <= 16.50 | | | | | | | | | | |--- feature_2 <= 60.00 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_2 > 60.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_6 > 16.50 | | | | | | | | | | |--- feature_0 <= 124349.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_0 > 124349.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | |--- feature_11 > 567.50 | | | | | | | | | |--- feature_5 <= 92460.00 | | | | | | | | | | |--- feature_14 <= 12196.00 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_14 > 12196.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_5 > 92460.00 | | | | | | | | | | |--- feature_3 <= 95705.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_3 > 95705.00 | | | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_25 > 0.50 | | | | | | | | |--- feature_6 <= 506.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_6 > 506.00 | | | | | | | | | |--- class: 1 | | | | |--- feature_10 > 3998.00 | | | | | |--- feature_33 <= 0.50 | | | | | | |--- feature_9 <= 7194.00 | | | | | | | |--- feature_23 <= 0.50 | | | | | | | | |--- feature_4 <= 7156.50 | | | | | | | | | |--- feature_0 <= 279610.50 | | | | | | | | | | |--- feature_14 <= 3843.00 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_14 > 3843.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | |--- feature_0 > 279610.50 | | | | | | | | | | |--- feature_24 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | | |--- feature_24 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_4 > 7156.50 | | | | | | | | | |--- feature_12 <= 28.00 | | | | | | | | | | |--- feature_13 <= 2921.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_13 > 2921.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_12 > 28.00 | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_23 > 0.50 | | | | | | | | |--- feature_6 <= 8506.50 | | | | | | | | | |--- feature_2 <= 59.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_2 > 59.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_6 > 8506.50 | | | | | | | | | |--- feature_14 <= 4254.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_14 > 4254.50 | | | | | | | | | | |--- feature_9 <= 4447.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_9 > 4447.50 | | | | | | | | | | | |--- class: 0 | | | | | | |--- feature_9 > 7194.00 | | | | | | | |--- feature_7 <= 127951.00 | | | | | | | | |--- feature_2 <= 27.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_2 > 27.50 | | | | | | | | | |--- feature_11 <= 5993.50 | | | | | | | | | | |--- feature_5 <= 15511.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- feature_5 > 15511.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_11 > 5993.50 | | | | | | | | | | |--- feature_64 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- feature_64 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_7 > 127951.00 | | | | | | | | |--- feature_10 <= 94158.00 | | | | | | | | | |--- feature_0 <= 170000.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_0 > 170000.00 | | | | | | | | | | |--- feature_0 <= 400000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_0 > 400000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_10 > 94158.00 | | | | | | | | | |--- class: 0 | | | | | |--- feature_33 > 0.50 | | | | | | |--- feature_23 <= 0.50 | | | | | | | |--- feature_22 <= 0.50 | | | | | | | | |--- feature_2 <= 56.50 | | | | | | | | | |--- feature_8 <= 52.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_8 > 52.50 | | | | | | | | | | |--- feature_13 <= 39.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_13 > 39.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | |--- feature_2 > 56.50 | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_22 > 0.50 | | | | | | | | |--- feature_4 <= 8520.50 | | | | | | | | | |--- feature_0 <= 80000.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_0 > 80000.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_4 > 8520.50 | | | | | | | | | |--- feature_7 <= 144459.50 | | | | | | | | | | |--- feature_3 <= 17356.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_3 > 17356.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_7 > 144459.50 | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_23 > 0.50 | | | | | | | |--- feature_16 <= 0.50 | | | | | | | | |--- feature_0 <= 169820.50 | | | | | | | | | |--- feature_12 <= 4971.00 | | | | | | | | | | |--- feature_10 <= 4000.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_10 > 4000.50 | | | | | | | | | | | |--- truncated branch of depth 15 | | | | | | | | | |--- feature_12 > 4971.00 | | | | | | | | | | |--- feature_12 <= 5229.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_12 > 5229.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | |--- feature_0 > 169820.50 | | | | | | | | | |--- feature_2 <= 45.50 | | | | | | | | | | |--- feature_8 <= 157315.50 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | | |--- feature_8 > 157315.50 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | |--- feature_2 > 45.50 | | | | | | | | | | |--- feature_12 <= 7148.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_12 > 7148.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | |--- feature_16 > 0.50 | | | | | | | | |--- feature_5 <= 15408.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_5 > 15408.50 | | | | | | | | | |--- feature_9 <= 1870.00 | | | | | | | | | | |--- feature_53 <= 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_53 > 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_9 > 1870.00 | | | | | | | | | | |--- feature_8 <= 194588.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | | |--- feature_8 > 194588.50 | | | | | | | | | | | |--- truncated branch of depth 4 | |--- feature_35 > 0.50 | | |--- feature_42 <= 0.50 | | | |--- feature_24 <= 0.50 | | | | |--- feature_22 <= 0.50 | | | | | |--- feature_64 <= 0.50 | | | | | | |--- feature_19 <= 0.50 | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | |--- feature_73 <= 0.50 | | | | | | | | | |--- feature_14 <= 5062.00 | | | | | | | | | | |--- feature_11 <= 16167.00 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | | |--- feature_11 > 16167.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_14 > 5062.00 | | | | | | | | | | |--- feature_10 <= 6550.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- feature_10 > 6550.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_73 > 0.50 | | | | | | | | | |--- feature_13 <= 2017.00 | | | | | | | | | | |--- feature_5 <= 29742.50 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | | | |--- feature_5 > 29742.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | |--- feature_13 > 2017.00 | | | | | | | | | | |--- feature_0 <= 225000.00 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- feature_0 > 225000.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | |--- feature_15 > 0.50 | | | | | | | | |--- feature_4 <= 93.50 | | | | | | | | | |--- feature_13 <= 3033.50 | | | | | | | | | | |--- feature_0 <= 430000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_0 > 430000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_13 > 3033.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_4 > 93.50 | | | | | | | | | |--- feature_0 <= 370000.00 | | | | | | | | | | |--- feature_2 <= 66.00 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | | |--- feature_2 > 66.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_0 > 370000.00 | | | | | | | | | | |--- class: 0 | | | | | | |--- feature_19 > 0.50 | | | | | | | |--- feature_16 <= 0.50 | | | | | | | | |--- feature_5 <= 158.00 | | | | | | | | | |--- feature_0 <= 175000.00 | | | | | | | | | | |--- feature_1 <= 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_1 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_0 > 175000.00 | | | | | | | | | | |--- feature_8 <= 88.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_8 > 88.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_5 > 158.00 | | | | | | | | | |--- feature_13 <= 28000.00 | | | | | | | | | | |--- feature_9 <= 4894.00 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_9 > 4894.00 | | | | | | | | | | | |--- truncated branch of depth 12 | | | | | | | | | |--- feature_13 > 28000.00 | | | | | | | | | | |--- feature_14 <= 378.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_14 > 378.50 | | | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_16 > 0.50 | | | | | | | | |--- feature_0 <= 45000.00 | | | | | | | | | |--- feature_9 <= 3490.00 | | | | | | | | | | |--- feature_10 <= 4667.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- feature_10 > 4667.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_9 > 3490.00 | | | | | | | | | | |--- feature_11 <= 1650.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_11 > 1650.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_0 > 45000.00 | | | | | | | | | |--- feature_6 <= 46058.00 | | | | | | | | | | |--- feature_0 <= 115000.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_0 > 115000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_6 > 46058.00 | | | | | | | | | | |--- feature_6 <= 135721.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | | |--- feature_6 > 135721.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | |--- feature_64 > 0.50 | | | | | | |--- class: 1 | | | | |--- feature_22 > 0.50 | | | | | |--- feature_45 <= 0.50 | | | | | | |--- feature_10 <= 3679.50 | | | | | | | |--- feature_6 <= 9332.50 | | | | | | | | |--- feature_11 <= 1665.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_11 > 1665.50 | | | | | | | | | |--- feature_6 <= 2679.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_6 > 2679.50 | | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_6 > 9332.50 | | | | | | | | |--- feature_4 <= 8840.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_4 > 8840.50 | | | | | | | | | |--- feature_13 <= 500.00 | | | | | | | | | | |--- feature_11 <= 811.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_11 > 811.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_13 > 500.00 | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_10 > 3679.50 | | | | | | | |--- feature_52 <= 0.50 | | | | | | | | |--- class: 0 | | | | | | | |--- feature_52 > 0.50 | | | | | | | | |--- class: 1 | | | | | |--- feature_45 > 0.50 | | | | | | |--- feature_0 <= 125000.00 | | | | | | | |--- feature_1 <= 0.50 | | | | | | | | |--- feature_13 <= 458.00 | | | | | | | | | |--- feature_2 <= 30.50 | | | | | | | | | | |--- feature_18 <= 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_18 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_2 > 30.50 | | | | | | | | | | |--- feature_9 <= 675.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_9 > 675.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_13 > 458.00 | | | | | | | | | |--- feature_6 <= 250.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_6 > 250.50 | | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_1 > 0.50 | | | | | | | | |--- feature_5 <= 13012.00 | | | | | | | | | |--- feature_4 <= 1228.00 | | | | | | | | | | |--- feature_9 <= 100.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_9 > 100.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_4 > 1228.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_5 > 13012.00 | | | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_15 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_0 > 125000.00 | | | | | | | |--- feature_12 <= 1952.00 | | | | | | | | |--- feature_8 <= -838.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_8 > -838.50 | | | | | | | | | |--- feature_52 <= 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_52 > 0.50 | | | | | | | | | | |--- feature_0 <= 185000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_0 > 185000.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | |--- feature_12 > 1952.00 | | | | | | | | |--- feature_14 <= 4311.50 | | | | | | | | | |--- feature_4 <= 1184.00 | | | | | | | | | | |--- feature_14 <= 660.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_14 > 660.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_4 > 1184.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_14 > 4311.50 | | | | | | | | | |--- class: 0 | | | |--- feature_24 > 0.50 | | | | |--- feature_74 <= 0.50 | | | | | |--- feature_9 <= 67.50 | | | | | | |--- feature_0 <= 45000.00 | | | | | | | |--- feature_10 <= 986.50 | | | | | | | | |--- feature_8 <= 141.00 | | | | | | | | | |--- feature_3 <= 4188.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_3 > 4188.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_8 > 141.00 | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_10 > 986.50 | | | | | | | | |--- feature_8 <= 39300.50 | | | | | | | | | |--- feature_8 <= 29967.50 | | | | | | | | | | |--- feature_7 <= 28919.00 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | | | |--- feature_7 > 28919.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_8 > 29967.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_8 > 39300.50 | | | | | | | | | |--- class: 0 | | | | | | |--- feature_0 > 45000.00 | | | | | | | |--- feature_54 <= 0.50 | | | | | | | | |--- feature_75 <= 0.50 | | | | | | | | | |--- feature_19 <= 0.50 | | | | | | | | | | |--- feature_11 <= 3800.00 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | | | |--- feature_11 > 3800.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | |--- feature_19 > 0.50 | | | | | | | | | | |--- feature_13 <= 2550.00 | | | | | | | | | | | |--- truncated branch of depth 14 | | | | | | | | | | |--- feature_13 > 2550.00 | | | | | | | | | | | |--- truncated branch of depth 11 | | | | | | | | |--- feature_75 > 0.50 | | | | | | | | | |--- feature_12 <= 1900.00 | | | | | | | | | | |--- feature_10 <= 5512.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_10 > 5512.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_12 > 1900.00 | | | | | | | | | | |--- feature_2 <= 26.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_2 > 26.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | |--- feature_54 > 0.50 | | | | | | | | |--- class: 1 | | | | | |--- feature_9 > 67.50 | | | | | | |--- feature_10 <= 31.00 | | | | | | | |--- feature_12 <= 2401.50 | | | | | | | | |--- feature_0 <= 55000.00 | | | | | | | | | |--- feature_72 <= 0.50 | | | | | | | | | | |--- feature_9 <= 1012.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_9 > 1012.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | |--- feature_72 > 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_0 > 55000.00 | | | | | | | | | |--- feature_8 <= 19764.50 | | | | | | | | | | |--- feature_6 <= 3214.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_6 > 3214.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_8 > 19764.50 | | | | | | | | | | |--- feature_12 <= 1977.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_12 > 1977.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | |--- feature_12 > 2401.50 | | | | | | | | |--- feature_3 <= 52156.50 | | | | | | | | | |--- feature_3 <= 4249.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- feature_3 > 4249.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- feature_3 > 52156.50 | | | | | | | | | |--- feature_13 <= 3554.00 | | | | | | | | | | |--- feature_3 <= 129362.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- feature_3 > 129362.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- feature_13 > 3554.00 | | | | | | | | | | |--- feature_11 <= 11009.00 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- feature_11 > 11009.00 | | | | | | | | | | | |--- class: 1 | | | | | | |--- feature_10 > 31.00 | | | | | | | |--- feature_19 <= 0.50 | | | | | | | | |--- feature_10 <= 3550.00 | | | | | | | | | |--- feature_9 <= 995.50 | | | | | | | | | | |--- feature_72 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- feature_72 > 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_9 > 995.50 | | | | | | | | | | |--- feature_8 <= 32427.00 | | | | | | | | | | | |--- truncated branch of depth 10 | | | | | | | | | | |--- feature_8 > 32427.00 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | |--- feature_10 > 3550.00 | | | | | | | | | |--- feature_14 <= 5016.50 | | | | | | | | | | |--- feature_4 <= 90814.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- feature_4 > 90814.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | |--- feature_14 > 5016.50 | | | | | | | | | | |--- feature_11 <= 9318.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_11 > 9318.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | |--- feature_19 > 0.50 | | | | | | | | |--- feature_16 <= 0.50 | | | | | | | | | |--- feature_3 <= 238774.50 | | | | | | | | | | |--- feature_11 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- feature_11 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 13 | | | | | | | | | |--- feature_3 > 238774.50 | | | | | | | | | | |--- feature_0 <= 336272.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_0 > 336272.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_16 > 0.50 | | | | | | | | | |--- feature_14 <= 1932.50 | | | | | | | | | | |--- feature_14 <= 615.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_14 > 615.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_14 > 1932.50 | | | | | | | | | | |--- feature_14 <= 4072.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_14 > 4072.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | |--- feature_74 > 0.50 | | | | | |--- class: 1 | | |--- feature_42 > 0.50 | | | |--- feature_5 <= 329.50 | | | | |--- feature_2 <= 46.00 | | | | | |--- feature_0 <= 325000.00 | | | | | | |--- feature_73 <= 0.50 | | | | | | | |--- feature_7 <= 81.50 | | | | | | | | |--- feature_0 <= 140000.00 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_0 > 140000.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_7 > 81.50 | | | | | | | | |--- class: 1 | | | | | | |--- feature_73 > 0.50 | | | | | | | |--- feature_5 <= 252.00 | | | | | | | | |--- class: 0 | | | | | | | |--- feature_5 > 252.00 | | | | | | | | |--- feature_15 <= 0.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_15 > 0.50 | | | | | | | | | |--- class: 0 | | | | | |--- feature_0 > 325000.00 | | | | | | |--- class: 0 | | | | |--- feature_2 > 46.00 | | | | | |--- class: 0 | | | |--- feature_5 > 329.50 | | | | |--- feature_11 <= 3579.50 | | | | | |--- feature_23 <= 0.50 | | | | | | |--- feature_7 <= 713.00 | | | | | | | |--- feature_10 <= 2222.50 | | | | | | | | |--- feature_10 <= 1900.00 | | | | | | | | | |--- feature_3 <= 14331.50 | | | | | | | | | | |--- feature_0 <= 35000.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- feature_0 > 35000.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- feature_3 > 14331.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_10 > 1900.00 | | | | | | | | | |--- class: 1 | | | | | | | |--- feature_10 > 2222.50 | | | | | | | | |--- class: 0 | | | | | | |--- feature_7 > 713.00 | | | | | | | |--- feature_7 <= 904.00 | | | | | | | | |--- feature_4 <= 890.00 | | | | | | | | | |--- feature_0 <= 40000.00 | | | | | | | | | | |--- feature_0 <= 15000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- feature_0 > 15000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_0 > 40000.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- feature_4 > 890.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- feature_7 > 904.00 | | | | | | | | |--- feature_9 <= 613.00 | | | | | | | | | |--- feature_0 <= 205000.00 | | | | | | | | | | |--- feature_5 <= 1348.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- feature_5 > 1348.00 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | |--- feature_0 > 205000.00 | | | | | | | | | | |--- feature_12 <= 5852.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- feature_12 > 5852.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- feature_9 > 613.00 | | | | | | | | | |--- feature_2 <= 24.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- feature_2 > 24.50 | | | | | | | | | | |--- class: 1 | | | | | |--- feature_23 > 0.50 | | | | | | |--- class: 1 | | | | |--- feature_11 > 3579.50 | | | | | |--- feature_16 <= 0.50 | | | | | | |--- class: 0 | | | | | |--- feature_16 > 0.50 | | | | | | |--- feature_3 <= 3299.50 | | | | | | | |--- class: 1 | | | | | | |--- feature_3 > 3299.50 | | | | | | | |--- class: 0 |--- feature_34 > 0.50 | |--- feature_4 <= -938.00 | | |--- feature_73 <= 0.50 | | | |--- feature_14 <= 2610.00 | | | | |--- class: 1 | | | |--- feature_14 > 2610.00 | | | | |--- class: 0 | | |--- feature_73 > 0.50 | | | |--- class: 0 | |--- feature_4 > -938.00 | | |--- feature_10 <= 157632.00 | | | |--- feature_4 <= 0.50 | | | | |--- feature_9 <= 1753.50 | | | | | |--- class: 1 | | | | |--- feature_9 > 1753.50 | | | | | |--- class: 0 | | | |--- feature_4 > 0.50 | | | | |--- feature_9 <= 19872.00 | | | | | |--- feature_0 <= 399282.00 | | | | | | |--- class: 1 | | | | | |--- feature_0 > 399282.00 | | | | | | |--- feature_11 <= 32.50 | | | | | | | |--- feature_62 <= 0.50 | | | | | | | | |--- class: 0 | | | | | | | |--- feature_62 > 0.50 | | | | | | | | |--- class: 1 | | | | | | |--- feature_11 > 32.50 | | | | | | | |--- class: 1 | | | | |--- feature_9 > 19872.00 | | | | | |--- feature_13 <= 9198.00 | | | | | | |--- class: 1 | | | | | |--- feature_13 > 9198.00 | | | | | | |--- feature_6 <= 441600.00 | | | | | | | |--- class: 0 | | | | | | |--- feature_6 > 441600.00 | | | | | | | |--- class: 1 | | |--- feature_10 > 157632.00 | | | |--- class: 0
train_preds = optimized_clf.predict_proba(X_train)[:,1]
test_preds = optimized_clf.predict_proba(X_test)[:,1]
train_class_preds = optimized_clf.predict(X_train)
test_class_preds = optimized_clf.predict(X_test)
# Get the accuracy scores
train_accuracy_ = accuracy_score(train_class_preds,y_train)
test_accuracy_lr = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_lr)
print("The accuracy on test data is ", test_accuracy_lr)
The accuracy on train data is 0.7538250231577602 The accuracy on test data is 0.49957849685493805
#print the accuracy,precission,recall,f1,roc_score
test_accuracy_lr= accuracy_score(test_class_preds,y_test)
test_precision_score_lr = precision_score(test_class_preds,y_test)
test_recall_score_lr = recall_score(test_class_preds,y_test)
test_f1_score_lr = f1_score(test_class_preds,y_test)
test_roc_score_lr = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_lr)
print("The precision on test data is ", test_precision_score_lr)
print("The recall on test data is ", test_recall_score_lr)
print("The f1 on test data is ", test_f1_score_lr)
print("The roc_score on test data is ", test_roc_score_lr)
The accuracy on test data is 0.49957849685493805 The precision on test data is 0.0006485084306095979 The recall on test data is 0.29411764705882354 The f1 on test data is 0.0012941633234114145 The roc_score on test data is 0.3969614462248156
cm_lr = confusion_matrix(y_train, train_class_preds)
print(cm_lr)
[[15636 17] [15646 8]]
labels = ['Not Defaulter', 'Defaulter']
ax= plt.subplot()
sns.heatmap(cm_lr, annot=True, ax = ax) #annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
[Text(0, 0.5, 'Not Defaulter'), Text(0, 1.5, 'Defaulter')]
len(optimal_rf_clf.feature_importances_)
82
train_class_preds = optimal_rf_clf.predict(X_train)
test_class_preds = optimal_rf_clf.predict(X_test)
y_preds_proba_rf = optimal_rf_clf.predict_proba(X_test)[::,1]
import sklearn.metrics as metrics
y_pred_proba = y_preds_proba_rf
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
def G(v):
bins = np.linspace(0., 100., 11)
total = float(np.sum(v))
yvals = []
for b in bins:
bin_vals = v[v <= np.percentile(v, b)]
bin_fraction = (np.sum(bin_vals) / total) * 100.0
yvals.append(bin_fraction)
# perfect equality area
pe_area = np.trapz(bins, x=bins)
# lorenz area
lorenz_area = np.trapz(yvals, x=bins)
gini_val = (pe_area - lorenz_area) / float(pe_area)
return bins, yvals, gini_val
bins, result, gini_val = G(y_preds_proba_rf)
plt.figure()
plt.subplot(2, 1, 1)
plt.plot(bins, result, label="observed")
plt.plot(bins, bins, '--', label="perfect eq.")
plt.xlabel("fraction of population")
plt.ylabel("fraction of wealth")
plt.title("GINI: %.4f" %(gini_val))
Text(0.5, 1.0, 'GINI: 0.1057')
from sklearn.neighbors import KNeighborsClassifier
credit_df_knn=credit_df_copy.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=8)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)
Confusion Matrix:
[[6373 1338]
[2328 5382]]
Classification Report:
precision recall f1-score support
0 0.73 0.83 0.78 7711
1 0.80 0.70 0.75 7710
accuracy 0.76 15421
macro avg 0.77 0.76 0.76 15421
weighted avg 0.77 0.76 0.76 15421
Accuracy: 0.7622722261850723
train_preds = optimized_clf.predict_proba(X_train)[:,1]
test_preds = optimized_clf.predict_proba(X_test)[:,1]
train_class_preds = optimized_clf.predict(X_train)
test_class_preds = optimized_clf.predict(X_test)
# Get the accuracy scores
train_accuracy_ = accuracy_score(train_class_preds,y_train)
test_accuracy_lr = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_lr)
print("The accuracy on test data is ", test_accuracy_lr)
The accuracy on train data is 0.7538250231577602 The accuracy on test data is 0.7522858439789897
#print the accuracy,precission,recall,f1,roc_score
test_accuracy_lr= accuracy_score(test_class_preds,y_test)
test_precision_score_lr = precision_score(test_class_preds,y_test)
test_recall_score_lr = recall_score(test_class_preds,y_test)
test_f1_score_lr = f1_score(test_class_preds,y_test)
test_roc_score_lr = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_lr)
print("The precision on test data is ", test_precision_score_lr)
print("The recall on test data is ", test_recall_score_lr)
print("The f1 on test data is ", test_f1_score_lr)
print("The roc_score on test data is ", test_roc_score_lr)
The accuracy on test data is 0.7522858439789897 The precision on test data is 0.6888456549935149 The recall on test data is 0.7889185977421271 The f1 on test data is 0.7354936989336657 The roc_score on test data is 0.7564111920693602
cm_lr = confusion_matrix(y_train, train_class_preds)
print(cm_lr)
[[12793 2860] [ 4852 10802]]
labels = ['Not Defaulter', 'Defaulter']
ax= plt.subplot()
sns.heatmap(cm_lr, annot=True, ax = ax) #annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
[Text(0, 0.5, 'Not Defaulter'), Text(0, 1.5, 'Defaulter')]
len(optimal_rf_clf.feature_importances_)
82
train_class_preds = optimal_rf_clf.predict(X_train)
test_class_preds = optimal_rf_clf.predict(X_test)
y_preds_proba_rf = optimal_rf_clf.predict_proba(X_test)[::,1]
def G(v):
bins = np.linspace(0., 100., 11)
total = float(np.sum(v))
yvals = []
for b in bins:
bin_vals = v[v <= np.percentile(v, b)]
bin_fraction = (np.sum(bin_vals) / total) * 100.0
yvals.append(bin_fraction)
# perfect equality area
pe_area = np.trapz(bins, x=bins)
# lorenz area
lorenz_area = np.trapz(yvals, x=bins)
gini_val = (pe_area - lorenz_area) / float(pe_area)
return bins, yvals, gini_val
bins, result, gini_val = G(y_preds_proba_rf)
plt.figure()
plt.subplot(2, 1, 1)
plt.plot(bins, result, label="observed")
plt.plot(bins, bins, '--', label="perfect eq.")
plt.xlabel("fraction of population")
plt.ylabel("fraction of wealth")
plt.title("GINI: %.4f" %(gini_val))
Text(0.5, 1.0, 'GINI: 0.3130')
import sklearn.metrics as metrics
y_pred_proba = y_preds_proba_rf
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)
# Initialize Gradient Boosting classifier
gb = GradientBoostingClassifier()
# Train the model on the training set
gb.fit(X_train, y_train)
# Predict on the test set
y_pred = gb.predict(X_test)
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
Accuracy: 0.7786784255236366
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)
Confusion Matrix:
[[6367 1344]
[2069 5641]]
Classification Report:
precision recall f1-score support
0 0.75 0.83 0.79 7711
1 0.81 0.73 0.77 7710
accuracy 0.78 15421
macro avg 0.78 0.78 0.78 15421
weighted avg 0.78 0.78 0.78 15421
Accuracy: 0.7786784255236366
train_preds = optimized_clf.predict_proba(X_train)[:,1]
test_preds = optimized_clf.predict_proba(X_test)[:,1]
train_class_preds = optimized_clf.predict(X_train)
test_class_preds = optimized_clf.predict(X_test)
# Get the accuracy scores
train_accuracy_ = accuracy_score(train_class_preds,y_train)
test_accuracy_lr = accuracy_score(test_class_preds,y_test)
print("The accuracy on train data is ", train_accuracy_lr)
print("The accuracy on test data is ", test_accuracy_lr)
The accuracy on train data is 0.7538250231577602 The accuracy on test data is 0.49957849685493805
#print the accuracy,precission,recall,f1,roc_score
test_accuracy_lr= accuracy_score(test_class_preds,y_test)
test_precision_score_lr = precision_score(test_class_preds,y_test)
test_recall_score_lr = recall_score(test_class_preds,y_test)
test_f1_score_lr = f1_score(test_class_preds,y_test)
test_roc_score_lr = roc_auc_score(test_class_preds,y_test)
print("The accuracy on test data is ", test_accuracy_lr)
print("The precision on test data is ", test_precision_score_lr)
print("The recall on test data is ", test_recall_score_lr)
print("The f1 on test data is ", test_f1_score_lr)
print("The roc_score on test data is ", test_roc_score_lr)
The accuracy on test data is 0.49957849685493805 The precision on test data is 0.0006485084306095979 The recall on test data is 0.29411764705882354 The f1 on test data is 0.0012941633234114145 The roc_score on test data is 0.3969614462248156
cm_lr = confusion_matrix(y_train, train_class_preds)
print(cm_lr)
[[15636 17] [15646 8]]
labels = ['Not Defaulter', 'Defaulter']
ax= plt.subplot()
sns.heatmap(cm_lr, annot=True, ax = ax) #annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
[Text(0, 0.5, 'Not Defaulter'), Text(0, 1.5, 'Defaulter')]
len(optimal_rf_clf.feature_importances_)
82
train_class_preds = optimal_rf_clf.predict(X_train)
test_class_preds = optimal_rf_clf.predict(X_test)
y_preds_proba_rf = optimal_rf_clf.predict_proba(X_test)[::,1]
def G(v):
bins = np.linspace(0., 100., 11)
total = float(np.sum(v))
yvals = []
for b in bins:
bin_vals = v[v <= np.percentile(v, b)]
bin_fraction = (np.sum(bin_vals) / total) * 100.0
yvals.append(bin_fraction)
# perfect equality area
pe_area = np.trapz(bins, x=bins)
# lorenz area
lorenz_area = np.trapz(yvals, x=bins)
gini_val = (pe_area - lorenz_area) / float(pe_area)
return bins, yvals, gini_val
bins, result, gini_val = G(y_preds_proba_rf)
plt.figure()
plt.subplot(2, 1, 1)
plt.plot(bins, result, label="observed")
plt.plot(bins, bins, '--', label="perfect eq.")
plt.xlabel("fraction of population")
plt.ylabel("fraction of wealth")
plt.title("GINI: %.4f" %(gini_val))
Text(0.5, 1.0, 'GINI: 0.1057')
import sklearn.metrics as metrics
y_pred_proba = y_preds_proba_rf
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()